In [1]:
#importing libraries

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
Bad key text.latex.preview in file /opt/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 123 ('text.latex.preview : False')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.5.0/matplotlibrc.template
or from the matplotlib source distribution

Bad key mathtext.fallback_to_cm in file /opt/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 155 ('mathtext.fallback_to_cm : True  # When True, use symbols from the Computer Modern')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.5.0/matplotlibrc.template
or from the matplotlib source distribution

Bad key savefig.jpeg_quality in file /opt/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 418 ('savefig.jpeg_quality: 95       # when a jpeg is saved, the default quality parameter.')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.5.0/matplotlibrc.template
or from the matplotlib source distribution

Bad key keymap.all_axes in file /opt/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 466 ('keymap.all_axes : a                 # enable all axes')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.5.0/matplotlibrc.template
or from the matplotlib source distribution

Bad key animation.avconv_path in file /opt/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 477 ('animation.avconv_path: avconv     # Path to avconv binary. Without full path')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.5.0/matplotlibrc.template
or from the matplotlib source distribution

Bad key animation.avconv_args in file /opt/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 479 ('animation.avconv_args:            # Additional arguments to pass to avconv')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.5.0/matplotlibrc.template
or from the matplotlib source distribution
In [2]:
# creating back up file 

loan_data_backup = pd.read_csv('/Users/vladimirant/Desktop/Walmart Data Science Interview/Data Science HW design val.csv')
In [3]:
# creating working data file


loan_data_train=loan_data_backup.copy()
In [4]:
loan_data_train.head()
Out[4]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 ... A22 A23 A24 A25 A26 A27 A28 A29 A30 default
0 22.88 75.59 3367.08 6131.31 41.38 43.08 2.13 102.67 0.500 0.500 ... 0.08 0.00 0.05 0.17 0.00 1035.14 158.71 13.28 0.00 1
1 15.94 86.26 5595.00 10867.86 52.29 61.21 3.05 124.94 3.200 2.400 ... 0.01 0.00 0.00 0.06 438.50 0.00 72.78 44.88 25.39 1
2 25.16 64.20 4758.44 7818.15 46.28 50.72 2.69 110.16 1.124 0.889 ... 0.08 0.02 0.02 0.10 622.70 755.52 102.89 6.24 0.00 1
3 19.50 77.81 5762.27 6290.00 58.14 76.27 2.44 119.92 1.222 1.000 ... 0.04 0.00 0.01 0.08 197.55 396.27 76.34 8.44 2.00 1
4 11.11 85.09 11400.50 20936.25 30.00 29.75 0.50 125.89 2.167 1.333 ... 0.00 0.00 0.00 0.04 0.00 0.00 0.00 0.00 0.00 1

5 rows × 31 columns

In [5]:
loan_data_train.tail()
Out[5]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 ... A22 A23 A24 A25 A26 A27 A28 A29 A30 default
11495 14.55 88.05 8492.47 28646.00 46.15 49.21 0.40 90.21 1.615 1.231 ... 0.00 0.00 0.00 0.04 0.88 72.39 18.19 10.84 0.00 0
11496 17.64 79.79 6303.06 8729.09 49.82 50.06 2.13 107.69 1.667 1.500 ... 0.11 0.01 0.01 0.13 269.84 449.14 2.18 22.84 21.80 0
11497 28.31 74.50 5713.82 10946.15 38.15 33.27 3.29 102.87 1.250 1.000 ... 0.03 0.00 0.00 0.10 0.00 201.04 12.08 6.98 0.00 0
11498 16.54 77.79 3028.71 4775.00 27.38 39.43 3.47 75.11 0.800 0.800 ... 0.00 0.00 0.00 0.06 237.75 0.00 10.56 7.62 9.38 0
11499 19.92 83.00 3105.63 6366.67 44.44 66.13 1.21 150.70 1.000 1.000 ... 0.06 0.00 0.02 0.11 72.11 841.82 104.60 98.00 0.00 0

5 rows × 31 columns

In [6]:
#getting information about data types of the variables

#all the variables are float or integer format, which is OK

loan_data_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11500 entries, 0 to 11499
Data columns (total 31 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   A1       11500 non-null  float64
 1   A2       11500 non-null  float64
 2   A3       11500 non-null  float64
 3   A4       11500 non-null  float64
 4   A5       11500 non-null  float64
 5   A6       11500 non-null  float64
 6   A7       11500 non-null  float64
 7   A8       11500 non-null  float64
 8   A9       11500 non-null  float64
 9   A10      11500 non-null  float64
 10  A11      11500 non-null  float64
 11  A12      11500 non-null  float64
 12  A13      11500 non-null  float64
 13  A14      11500 non-null  float64
 14  A15      11500 non-null  float64
 15  A16      11500 non-null  float64
 16  A17      11500 non-null  float64
 17  A18      11500 non-null  float64
 18  A19      11500 non-null  float64
 19  A20      11500 non-null  float64
 20  A21      11500 non-null  int64  
 21  A22      11500 non-null  float64
 22  A23      11500 non-null  float64
 23  A24      11500 non-null  float64
 24  A25      11500 non-null  float64
 25  A26      11500 non-null  float64
 26  A27      11500 non-null  float64
 27  A28      11500 non-null  float64
 28  A29      11500 non-null  float64
 29  A30      11500 non-null  float64
 30  default  11500 non-null  int64  
dtypes: float64(29), int64(2)
memory usage: 2.7 MB
In [10]:
loan_data_train.describe()
Out[10]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 ... A22 A23 A24 A25 A26 A27 A28 A29 A30 default
count 11500.000000 11500.000000 11500.000000 11500.000000 11500.000000 11500.000000 11500.000000 11500.000000 11500.000000 11500.000000 ... 11500.000000 11500.000000 11500.000000 11500.000000 11500.000000 11500.000000 11500.000000 11500.000000 11500.000000 11500.000000
mean 12.912370 82.996250 7121.193469 12969.915071 34.567032 38.290894 1.608435 121.486797 1.642194 1.363613 ... 0.077656 0.009178 0.014158 0.135987 289.664904 353.898283 102.034951 27.673394 40.731303 0.130435
std 7.035279 9.017252 2673.425470 7960.762380 12.484591 15.023454 1.042281 33.738120 0.732355 0.598877 ... 0.153989 0.024546 0.030284 0.158443 520.792137 584.500018 131.415911 53.560611 156.780339 0.336796
min 0.000000 16.710000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 -0.010000 -0.010000 0.000000 -559.990000 -140.940000 -187.120000 -447.680000 -223.890000 0.000000
25% 8.210000 77.947500 5318.782500 8294.440000 27.477500 29.447500 0.890000 98.822500 1.200000 1.000000 ... 0.010000 0.000000 0.000000 0.060000 0.000000 0.000000 14.180000 0.000000 0.000000 0.000000
50% 12.550000 83.580000 7129.690000 11471.620000 33.360000 36.690000 1.500000 124.890000 1.632000 1.374000 ... 0.040000 0.000000 0.010000 0.090000 111.090000 104.725000 60.015000 10.040000 0.000000 0.000000
75% 17.000000 89.150000 8635.810000 15465.432500 40.640000 45.800000 2.130000 141.450000 2.000000 1.667000 ... 0.080000 0.010000 0.010000 0.140000 377.125000 482.075000 139.065000 34.030000 49.945000 0.000000
max 100.000000 100.000000 26862.500000 132787.500000 128.000000 128.000000 11.080000 319.670000 10.000000 6.000000 ... 4.920000 0.680000 0.770000 2.340000 26625.900000 9094.610000 1682.140000 1312.530000 14423.670000 1.000000

8 rows × 31 columns

In [6]:
#Checking for missing values
pd.options.display.max_rows = None
# Sets the pandas dataframe options to display all columns/ rows.
loan_data_train.isnull().sum()

#NO missinng values
Out[6]:
A1         0
A2         0
A3         0
A4         0
A5         0
A6         0
A7         0
A8         0
A9         0
A10        0
A11        0
A12        0
A13        0
A14        0
A15        0
A16        0
A17        0
A18        0
A19        0
A20        0
A21        0
A22        0
A23        0
A24        0
A25        0
A26        0
A27        0
A28        0
A29        0
A30        0
default    0
dtype: int64
In [7]:
# Check for unique values of the variables

for i in loan_data_train.columns:
    print(i,len(loan_data_train[i].unique()),loan_data_train[i].unique())
A1 2652 [22.88 15.94 25.16 ...  4.85 28.5  28.31]
A2 3245 [75.59 86.26 64.2  ... 98.89 95.2  77.51]
A3 10932 [3367.08 5595.   4758.44 ... 5713.82 3028.71 3105.63]
A4 10188 [ 6131.31 10867.86  7818.15 ...  8729.09 10946.15  4775.  ]
A5 3863 [41.38 52.29 46.28 ... 45.4  23.96 14.76]
A6 4224 [43.08 61.21 50.72 ... 37.73 56.07 50.06]
A7 551 [ 2.13  3.05  2.69  2.44  0.5   3.88  1.25  2.23  1.67  2.38  1.41  2.87
  6.81  4.56  1.78  1.3   2.82  4.22  1.9   4.    2.62  2.63  1.88  1.59
  0.71  2.67  1.45  2.88  2.6   0.94  1.93  1.2   1.96  1.94  3.74  1.29
  2.5   2.    0.    0.06  2.86  2.53  2.58  1.23  3.68  2.9   0.07  3.09
  2.28  4.5   2.11  1.75  3.22  1.53  2.27  1.95  2.78  2.4   2.54  2.31
  1.63  2.17  0.25  2.14  1.85  1.5   2.33  3.44  1.47  2.51  1.49  1.8
  1.12  1.69  2.35  0.73  3.7   1.76  3.43  2.39  3.18  4.14  2.85  2.01
  1.89  3.89  0.97  1.37  1.38  3.32  3.13  1.13  3.02  1.54  1.27  2.25
  0.27  1.21  0.83  1.09  3.57  2.79  3.69  3.37  3.    2.05  1.48  3.65
  1.52  4.17  4.23  2.83  4.06  1.07  2.64  6.33  2.73  3.42  1.32  3.4
  5.6   4.7   2.52  2.76  0.75  4.05  5.85  1.74  1.17  1.33  4.6   2.93
  0.8   0.84  3.31  0.43  2.08  3.78  0.86  1.61  2.26  3.83  0.17  5.73
  1.44  1.22  2.04  0.52  3.79  2.81  1.71  2.3   0.89  3.04  3.08  2.19
  2.49  1.55  3.35  5.1   4.67  4.84  1.43  0.92  2.15  3.71  4.78  1.82
  1.64  1.24  4.19  2.41  2.56  3.11  3.2   1.1   0.67  4.26  1.86  2.09
  2.47  1.98  2.94  3.19  3.75  1.    0.88  3.5   6.67  0.13  3.39  2.2
  0.46  2.95  6.75  3.92  1.84  3.1   1.34  1.4   1.31  3.93  1.91  3.33
  0.87  3.25  3.53  4.65  2.03  2.06  4.27  4.1   3.07  1.14  1.06  4.8
  1.36  2.42  3.95  2.29  2.37  0.49  1.92  0.81  1.83  1.81  3.64  3.82
  0.93  1.68  2.32  0.65  1.16  2.1   4.38  2.55  1.87  2.61  2.7   2.75
  2.36  2.24  1.05  3.3   2.66  2.59  3.26  0.55  2.92  3.36  2.57  1.28
  2.89  2.74  2.22  3.47  1.79  0.78  1.56  1.7   0.41  1.62  0.82  0.69
  5.12  5.32  7.07  2.02  2.97  7.14  2.8   1.35  2.45  3.01  0.56  2.18
  0.77  1.65  6.8   2.46  3.21  0.29  0.91  3.59  4.54  0.64  1.66  1.99
  4.75  0.44  2.43  2.98  6.22  0.57  4.61  3.15  2.68  3.94  3.17  3.6
  7.78  4.36  2.07  1.18  3.23  0.98  4.13  0.33  0.7   4.64  0.03  1.58
  1.15  1.11  3.03  3.29  2.12  3.46  2.71  4.29  4.15  7.54  4.18  0.38
  3.72  4.09  3.14  3.27  1.03  1.39  2.96  3.61  3.91  3.84  1.57 11.08
  2.21  2.16  1.6   1.73  5.95  3.28  4.07  4.71  3.63  6.5   1.19  3.38
  4.73  3.76  2.65  4.08  4.47  1.01  0.58  1.46  0.6   3.48  6.78  2.77
  0.9   3.54  3.51  0.95  1.77  3.77  5.55  7.69  4.43  4.2   5.5   5.71
  9.    3.73  4.46  4.3   4.76  3.06  4.82  0.42  0.54  0.68  7.47  5.53
  0.35  0.22  0.79  3.58  7.23  0.53  3.85  5.    1.02  1.08  4.53  0.76
  5.07  3.24  0.85  4.03  3.41  7.35  4.86  6.44  4.44  4.4   1.97  3.81
  6.35  1.04  3.49  2.72  5.17  1.26  1.72  0.62  0.1   0.12  0.37  4.33
  0.11  0.72  0.3   0.09  0.48  2.34  6.96  1.42  0.59  2.48  0.15  0.21
  0.47  0.45  5.27  4.63  0.08  0.28  0.19  0.99  0.61  5.78  0.66  8.
  4.88  0.63  5.44  0.74  0.2   7.    0.04  4.69  3.34  0.36  0.4   0.26
  0.14  0.18  0.31  0.96  3.8   0.23  9.5   4.95  2.84  6.27  3.56  0.05
  4.01  6.19  0.34  0.39  0.16  5.74  4.24  5.65  5.29  5.13  5.83  5.21
  3.86  5.75  1.51  5.3   3.52  3.55  3.16  0.51  3.96  0.24  8.75  5.19
  0.32  5.63  5.48  5.86  3.87  4.39  4.02  4.62  5.33  6.85  3.12  4.28
  6.    4.83  5.69  7.53  5.77  4.51  3.67  3.9   4.79  8.55  4.25  5.38
  4.11  5.43  2.91  9.92  5.8   4.55  6.63  4.45  6.09  4.31  6.08  5.67
  5.76  3.45  6.69  5.03  5.42  3.97  2.99  6.21  7.71  9.43  5.57  5.25
  4.96  4.89  6.9   5.2   4.9   5.05  3.62  6.56  5.7   4.41  4.21]
A8 6734 [102.67 124.94 110.16 ...  90.21 107.69  75.11]
A9 1654 [0.5   3.2   1.124 ... 2.401 4.375 2.527]
A10 1364 [0.5   2.4   0.889 ... 0.3   3.063 1.918]
A11 3255 [75.85 75.   62.57 ... 95.75 76.99 77.96]
A12 1461 [ 9.48 11.72 10.53 ... 19.31 20.4  17.53]
A13 3227 [22.22 15.73 32.24 ... 29.04  4.06 43.47]
A14 3979 [58.46 63.36 62.83 ... 43.84 31.53 69.23]
A15 11091 [ 42131.54 148530.21  39652.58 ... 160962.23  74871.6   29787.82]
A16 180 [0.65 0.33 0.55 0.5  0.   0.46 0.1  0.47 0.79 0.28 0.63 0.2  0.77 0.86
 0.75 0.27 0.56 2.   0.3  0.38 0.42 0.17 0.16 0.9  0.43 0.57 0.24 0.68
 0.12 0.94 0.36 0.19 0.44 1.21 0.32 1.1  0.69 0.6  0.31 0.64 0.13 0.73
 0.21 0.8  0.61 0.29 1.24 0.35 0.51 1.   0.52 0.48 1.12 0.41 0.66 0.58
 0.14 0.23 0.22 0.06 0.88 0.84 0.54 0.25 0.4  0.07 0.37 0.62 0.85 1.13
 0.89 0.08 0.91 0.18 0.09 0.39 0.83 0.53 0.81 1.39 0.71 1.22 0.76 1.09
 1.53 0.59 1.25 0.11 0.72 0.93 1.79 1.06 0.92 0.15 0.7  1.2  0.45 0.26
 0.34 0.78 1.15 3.1  0.67 2.38 1.14 0.82 2.22 0.04 1.26 0.87 1.81 1.29
 1.04 1.38 1.43 1.73 1.45 0.74 1.17 0.03 0.05 1.19 1.63 1.07 1.58 1.56
 1.23 1.05 1.08 1.65 2.8  1.33 3.9  1.11 1.83 1.88 1.47 1.5  1.7  2.11
 0.49 1.3  1.27 1.95 0.96 2.5  0.02 1.28 1.46 3.14 0.95 1.36 1.31 2.03
 1.6  1.44 1.75 1.94 1.64 1.76 1.86 1.69 0.01 1.54 1.01 1.16 1.91 1.4
 1.42 2.17 1.35 3.   4.   3.4  3.33 2.85 1.87 1.67 1.82 2.4 ]
A17 8932 [4005.22 2281.82 3050.54 ... 4412.56 4109.41 3538.95]
A18 129 [0.33 0.38 0.36 0.31 0.   0.42 0.1  0.21 0.23 0.43 0.73 0.29 0.75 0.17
 1.38 0.19 0.08 0.18 0.2  0.09 0.14 0.56 0.16 0.48 0.05 0.35 0.5  0.44
 0.3  0.28 0.22 0.8  0.58 0.62 0.47 0.6  0.55 0.57 1.   0.39 0.34 0.79
 0.4  0.64 0.27 0.13 0.24 0.15 0.25 0.65 0.54 1.67 0.72 0.63 0.26 0.77
 0.11 0.88 0.32 0.46 0.12 0.69 0.71 1.14 0.95 0.83 0.41 1.43 0.53 0.06
 0.51 0.37 0.68 0.45 0.59 0.78 0.81 1.06 0.89 1.25 0.91 0.07 0.76 1.13
 0.7  1.33 0.52 0.84 1.11 1.63 0.67 0.04 0.86 0.85 0.82 1.75 0.92 2.
 0.03 1.22 1.21 0.93 2.8  0.61 1.44 1.5  0.87 1.2  0.66 0.9  0.49 1.17
 1.4  0.02 0.94 1.07 1.19 1.08 1.27 1.6  1.3  1.18 0.74 1.12 2.53 1.83
 1.15 0.01 1.8 ]
A19 112 [0.33 0.   0.25 0.27 0.1  0.36 0.4  0.5  0.07 0.71 0.14 0.13 0.11 0.64
 0.08 0.22 0.29 0.41 0.19 0.09 0.65 0.3  0.17 0.39 0.24 0.2  0.38 0.12
 0.77 0.15 0.21 0.16 0.67 0.23 0.44 0.55 0.05 0.28 0.45 0.06 0.37 0.18
 0.88 0.35 0.04 0.63 0.43 0.47 0.31 0.53 0.46 0.58 1.   0.49 0.81 0.57
 1.2  2.44 0.42 0.73 0.89 0.02 0.26 0.83 1.5  0.85 0.6  0.78 1.25 0.32
 0.7  0.54 0.56 0.52 0.86 0.75 1.46 1.08 1.8  0.48 0.92 1.38 0.82 1.6
 2.   1.11 1.14 0.03 0.84 0.8  2.83 0.59 1.15 1.33 1.13 0.01 0.62 0.68
 1.16 1.85 0.87 1.21 1.44 0.9  0.34 1.4  3.   2.25 0.69 2.75 0.74 2.67]
A20 397 [1.25 2.37 1.51 2.69 2.35 3.33 2.32 0.78 1.56 1.41 1.54 1.42 2.61 1.67
 0.8  2.2  0.94 0.89 2.8  1.13 1.57 1.31 1.63 2.54 1.73 2.93 1.43 1.29
 1.93 2.07 2.43 3.88 1.97 2.   0.5  1.24 1.28 0.29 1.72 1.9  1.79 1.39
 2.83 1.15 1.64 1.33 0.57 2.09 1.74 1.17 2.25 1.53 3.09 1.06 2.67 1.36
 3.58 1.8  4.18 1.22 1.4  0.88 0.86 1.18 2.94 1.86 2.33 1.83 2.12 2.39
 1.04 2.4  1.95 2.05 2.21 2.14 2.26 1.11 3.23 1.21 1.19 3.47 1.32 2.28
 3.81 1.75 3.38 1.5  1.6  3.65 1.89 0.67 1.94 1.14 1.78 1.61 1.98 3.46
 1.34 2.89 2.7  1.68 2.5  2.06 1.66 2.17 2.3  2.58 2.75 2.1  1.44 2.45
 1.62 2.29 2.27 1.1  1.07 1.81 2.24 1.46 1.7  4.22 2.46 0.76 2.65 3.
 1.85 3.5  2.87 2.22 1.23 2.04 1.91 1.   2.42 1.49 3.25 1.27 2.18 2.86
 2.44 2.81 3.75 1.76 1.38 3.05 3.07 1.87 2.19 1.45 3.95 2.15 1.59 3.62
 2.16 3.29 2.88 0.25 3.93 1.47 1.88 1.35 2.08 1.58 2.57 2.64 3.42 3.04
 0.83 2.38 0.84 4.88 2.63 1.69 1.55 2.13 1.3  2.71 3.41 3.86 0.97 4.33
 1.48 3.8  1.37 2.77 2.56 1.96 0.9  3.35 2.11 4.4  1.65 3.82 1.26 0.75
 3.37 0.82 1.92 2.36 1.52 0.53 2.48 0.87 2.52 0.93 1.16 3.89 3.2  1.84
 0.92 2.76 2.41 3.4  2.62 4.17 2.55 2.23 1.71 2.68 3.27 1.77 3.17 2.79
 2.74 1.02 3.06 3.08 0.36 2.47 3.21 4.04 3.52 0.79 1.03 0.62 2.82 0.45
 1.2  0.52 2.85 0.81 1.09 3.71 3.22 1.82 2.91 1.12 2.73 0.68 0.6  4.
 0.56 0.91 2.72 3.39 4.79 3.32 3.19 2.03 1.08 0.47 0.19 0.33 2.95 3.14
 3.44 3.43 3.16 0.77 0.85 2.53 2.34 0.44 0.71 2.92 5.2  0.74 2.6  3.61
 3.45 3.67 0.46 4.73 2.78 0.7  0.65 0.95 3.28 0.13 3.11 0.4  2.59 3.6
 3.3  3.24 1.05 0.22 0.42 3.15 3.1  0.63 0.54 0.58 0.38 3.13 0.99 4.5
 1.01 2.01 2.9  3.87 0.64 0.69 5.   0.14 0.43 0.98 0.96 2.96 4.2  3.48
 2.49 0.55 3.57 2.51 0.   4.08 4.9  0.73 3.64 0.21 0.3  2.02 3.56 0.61
 3.36 0.37 3.85 0.72 0.49 0.31 1.99 2.31 0.59 0.2  3.18 3.73 4.36 3.12
 3.55 4.09 0.11 2.98 5.81 2.66 0.24 4.26 3.78 6.27 3.03 3.53 0.35 4.74
 2.97 0.23 3.72 0.32 4.75 4.15 0.17 4.29 3.63 0.1  0.16 0.39 3.83 0.08
 3.77 0.66 0.27 3.9  5.3  4.44 4.23 0.18 4.14 2.84 4.67 4.55 4.56 3.91
 4.16 5.44 0.28 0.26 5.33]
A21 349 [   28    64    55   283    26    87    50    29    77    48   106    43
    13     5    36    24    42    31    18    56    44    14    34    22
   178   167     7    37    33   103     6    17   136    49    19    35
   297    16    57    47    66    39   129   139    40     4    69   132
    15   226    71    41    32    95    21    68    52   102    20    81
   186    25     9    75   104   150    45    51   105    98     1     8
    30    70    79   148    23    11    83    91    12    76    53    54
    58   140    27    65   122    93   131    59    99    10    62    90
    96    67   134    60    46   120   154    92   119     2   137   365
   211   130   197    80   121    38     3   116   262    78   127   101
    73   117   100    88   224   128    84   255   200   258   234   115
   152   432    85    63   111   138   108    61    89    82   222    74
   125   298   177   107   272    86   110   384   214   205   259   113
   118   114   410   145    72   146    97   151   123   264   144   221
   311   274   112   285   126   345   188   159   213   161   507   141
   227   168   160   109   156    94   164   406   174   124   147   207
   261   194   308   171   143   400   244   195   218   238   240   201
   435   165   855   191   184   192   175   172   217   176   518   180
   271   357   223   276   166   516   182   279   183   163   290   173
   185   158   170   249   153   157   187   199   538   230   193   371
   215   350   142   209   242   282   208   231   501   202   135   225
   235   265   521   155   296   386   425   254   325   281   360   368
   358   196   246   189   318   267   511   542   169   181   228   536
   473   203   149   275   448   328   327   204   219   353   277   332
   229   341   466   460   266   198   492   247   278   481   232   133
   462   317   269   239   263   334   220   320   413   498   395 18288
   212   233   447   326   190   402   179   440   469   373   515   206
   437   359   598   268   427   273   216   251   294   315   210   300
   280   306   411   256   355   585   527   248   361   291   342   428
   458]
A22 137 [0.08 0.01 0.04 0.   0.03 0.05 0.07 0.12 0.02 0.06 0.14 0.13 0.51 0.1
 0.33 0.25 0.19 0.37 0.11 0.09 0.15 0.32 0.64 0.16 0.21 0.26 0.18 0.17
 0.22 0.39 0.81 0.41 0.34 0.24 0.55 1.35 0.88 0.56 0.2  0.35 0.23 0.3
 0.36 0.46 0.28 0.38 0.4  0.31 0.45 0.8  0.49 0.43 0.47 0.78 0.5  0.42
 0.57 1.25 1.14 0.63 0.82 1.17 0.27 0.73 0.84 0.83 0.7  0.85 0.74 0.29
 0.67 0.53 0.58 0.6  1.82 1.96 0.72 1.12 0.61 0.44 0.87 0.62 0.92 0.48
 0.65 1.64 0.98 1.04 1.26 1.1  0.75 1.61 0.89 0.52 1.02 0.77 1.05 0.54
 1.81 2.06 0.86 1.16 1.23 1.   0.76 1.27 1.42 0.9  0.71 1.15 0.59 1.46
 1.53 0.94 0.66 1.95 0.68 1.18 1.47 0.99 0.91 4.92 0.79 1.19 2.5  1.11
 1.07 2.2  1.09 1.49 0.69 1.21 1.87 1.67 4.   2.69 0.93]
A23 39 [ 0.    0.02  0.01  0.05  0.03  0.13  0.04  0.07  0.09  0.06  0.12  0.18
  0.16  0.26  0.15  0.2   0.08  0.1   0.11  0.14  0.42  0.31  0.21  0.19
  0.29  0.25  0.28  0.57  0.17  0.32  0.33  0.44  0.68 -0.01  0.23  0.24
  0.22  0.34  0.52]
A24 43 [ 0.05  0.    0.02  0.01  0.03  0.12  0.07  0.04  0.08  0.13  0.16  0.09
  0.06  0.21  0.11  0.14  0.1   0.18  0.39  0.34  0.25  0.31  0.17  0.27
  0.15  0.22  0.19  0.28  0.41 -0.01  0.2   0.24  0.36  0.37  0.23  0.32
  0.38  0.77  0.26  0.3   0.52  0.42  0.76]
A25 127 [0.17 0.06 0.1  0.08 0.04 0.03 0.09 0.07 0.14 0.28 0.11 0.05 0.61 0.13
 0.12 0.75 0.02 0.16 0.26 0.18 0.27 0.52 0.15 0.19 0.21 0.2  0.29 0.01
 0.22 0.48 0.23 0.41 0.71 0.3  0.53 0.84 0.7  0.63 0.59 0.31 0.38 0.43
 0.25 0.5  0.24 0.64 0.91 0.33 0.35 0.39 0.37 0.44 0.6  0.76 0.58 0.32
 0.34 0.45 0.55 0.82 1.25 0.49 0.4  0.56 0.42 0.78 0.36 0.83 0.47 0.
 0.51 0.65 0.74 0.81 0.54 0.99 0.67 0.73 0.68 0.46 0.96 0.95 1.03 0.62
 0.88 1.14 1.21 0.69 0.97 1.05 0.85 1.43 0.86 0.72 0.77 1.2  0.98 1.12
 1.17 0.57 1.41 2.34 1.47 1.09 0.94 0.9  0.79 1.26 1.23 1.22 1.13 0.66
 0.89 0.8  0.87 1.02 1.07 0.92 1.38 1.04 1.42 1.46 1.56 1.   1.01 0.93
 1.33]
A26 7594 [  0.   438.5  622.7  ... 269.84 237.75  72.11]
A27 6929 [1035.14    0.    755.52 ...  449.14  201.04  841.82]
A28 7927 [158.71  72.78 102.89 ...  92.65  18.19  10.56]
A29 4605 [ 13.28  44.88   6.24 ... 163.45  10.84  22.84]
A30 4445 [ 0.   25.39  2.   ... 96.13 21.8   9.38]
default 2 [1 0]
In [8]:
loan_data_train["A24"].unique()
Out[8]:
array([ 0.05,  0.  ,  0.02,  0.01,  0.03,  0.12,  0.07,  0.04,  0.08,
        0.13,  0.16,  0.09,  0.06,  0.21,  0.11,  0.14,  0.1 ,  0.18,
        0.39,  0.34,  0.25,  0.31,  0.17,  0.27,  0.15,  0.22,  0.19,
        0.28,  0.41, -0.01,  0.2 ,  0.24,  0.36,  0.37,  0.23,  0.32,
        0.38,  0.77,  0.26,  0.3 ,  0.52,  0.42,  0.76])
In [9]:
loan_data_train["A22"].unique()
Out[9]:
array([0.08, 0.01, 0.04, 0.  , 0.03, 0.05, 0.07, 0.12, 0.02, 0.06, 0.14,
       0.13, 0.51, 0.1 , 0.33, 0.25, 0.19, 0.37, 0.11, 0.09, 0.15, 0.32,
       0.64, 0.16, 0.21, 0.26, 0.18, 0.17, 0.22, 0.39, 0.81, 0.41, 0.34,
       0.24, 0.55, 1.35, 0.88, 0.56, 0.2 , 0.35, 0.23, 0.3 , 0.36, 0.46,
       0.28, 0.38, 0.4 , 0.31, 0.45, 0.8 , 0.49, 0.43, 0.47, 0.78, 0.5 ,
       0.42, 0.57, 1.25, 1.14, 0.63, 0.82, 1.17, 0.27, 0.73, 0.84, 0.83,
       0.7 , 0.85, 0.74, 0.29, 0.67, 0.53, 0.58, 0.6 , 1.82, 1.96, 0.72,
       1.12, 0.61, 0.44, 0.87, 0.62, 0.92, 0.48, 0.65, 1.64, 0.98, 1.04,
       1.26, 1.1 , 0.75, 1.61, 0.89, 0.52, 1.02, 0.77, 1.05, 0.54, 1.81,
       2.06, 0.86, 1.16, 1.23, 1.  , 0.76, 1.27, 1.42, 0.9 , 0.71, 1.15,
       0.59, 1.46, 1.53, 0.94, 0.66, 1.95, 0.68, 1.18, 1.47, 0.99, 0.91,
       4.92, 0.79, 1.19, 2.5 , 1.11, 1.07, 2.2 , 1.09, 1.49, 0.69, 1.21,
       1.87, 1.67, 4.  , 2.69, 0.93])
In [10]:
#Plotting heatmap/correlation table

import seaborn as sb
plt.subplots(figsize=(30,30))
corr = loan_data_train.corr()
sb.heatmap(corr, annot=True,)
Out[10]:
<AxesSubplot:>
In [11]:
#Checking correlation table for highly correlated variables

corr>0.75
Out[11]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 ... A22 A23 A24 A25 A26 A27 A28 A29 A30 default
A1 True False False False False False False False False False ... False False False False False False False False False False
A2 False True False False False False False False False False ... False False False False False False False False False False
A3 False False True False False False False False False False ... False False False False False False False False False False
A4 False False False True False False False False False False ... False False False False False False False False False False
A5 False False False False True True False False False False ... False False False False False False False False False False
A6 False False False False True True False False False False ... False False False False False False False False False False
A7 False False False False False False True False False False ... False False False False False False False False False False
A8 False False False False False False False True False False ... False False False False False False False False False False
A9 False False False False False False False False True True ... False False False False False False False False False False
A10 False False False False False False False False True True ... False False False False False False False False False False
A11 False True False False False False False False False False ... False False False False False False False False False False
A12 False False False False False False False True False False ... False False False False False False False False False False
A13 False False False False False False False False False False ... False False False False False False False False False False
A14 False False False False False False False False False False ... False False False False False False False False False False
A15 False False False False False False False False False False ... False False False False False False False False False False
A16 False False False False False False False False False False ... False False False False False False False False False False
A17 False False False False False False False False False False ... False False False False False False False False False False
A18 False False False False False False False False False False ... False False False False False False False False False False
A19 False False False False False False False False False False ... False False False False False False False False False False
A20 False False False False False False False False False False ... False False False False False False False False False False
A21 False False False False False False False False False False ... False False False False False False False False False False
A22 False False False False False False False False False False ... True False True False False False False False False False
A23 False False False False False False False False False False ... False True False False False False False False False False
A24 False False False False False False False False False False ... True False True False False False False False False False
A25 False False False False False False False False False False ... False False False True False False False False False False
A26 False False False False False False False False False False ... False False False False True False False False False False
A27 False False False False False False False False False False ... False False False False False True False False False False
A28 False False False False False False False False False False ... False False False False False False True False False False
A29 False False False False False False False False False False ... False False False False False False False True False False
A30 False False False False False False False False False False ... False False False False False False False False True False
default False False False False False False False False False False ... False False False False False False False False False True

31 rows × 31 columns

In [12]:
#https://www.projectpro.io/recipes/drop-out-highly-correlated-features-in-python

cor_matrix =loan_data_train.corr().abs()
print(cor_matrix)
               A1        A2        A3        A4        A5        A6        A7  \
A1       1.000000  0.765991  0.500946  0.398379  0.524815  0.512808  0.525636   
A2       0.765991  1.000000  0.542537  0.421217  0.515083  0.511911  0.622148   
A3       0.500946  0.542537  1.000000  0.669774  0.542767  0.582183  0.463837   
A4       0.398379  0.421217  0.669774  1.000000  0.340750  0.429054  0.392702   
A5       0.524815  0.515083  0.542767  0.340750  1.000000  0.906412  0.400811   
A6       0.512808  0.511911  0.582183  0.429054  0.906412  1.000000  0.414438   
A7       0.525636  0.622148  0.463837  0.392702  0.400811  0.414438  1.000000   
A8       0.410629  0.409334  0.589492  0.403194  0.468826  0.485549  0.310175   
A9       0.357320  0.440476  0.557666  0.515189  0.321403  0.361232  0.367284   
A10      0.325700  0.417044  0.531078  0.424182  0.303565  0.329261  0.330191   
A11      0.501417  0.785668  0.440930  0.325511  0.392623  0.382044  0.488365   
A12      0.397308  0.402067  0.498547  0.364889  0.425534  0.436526  0.301733   
A13      0.417887  0.421255  0.469657  0.356764  0.706380  0.595350  0.358860   
A14      0.412599  0.414344  0.404430  0.283833  0.698935  0.659413  0.333122   
A15      0.327669  0.398560  0.538158  0.576415  0.202053  0.242988  0.373963   
A16      0.479155  0.370945  0.292726  0.227719  0.253574  0.261308  0.461420   
A17      0.289152  0.302030  0.363149  0.349986  0.271317  0.268889  0.245104   
A18      0.444483  0.351340  0.273880  0.211860  0.231123  0.241580  0.415957   
A19      0.356318  0.277000  0.241928  0.194955  0.208069  0.213475  0.365886   
A20      0.308516  0.246800  0.358312  0.223189  0.380782  0.387012  0.258590   
A21      0.011740  0.010981  0.012382  0.001317  0.000481  0.002093  0.001268   
A22      0.018112  0.054012  0.050737  0.068794  0.031421  0.039143  0.057184   
A23      0.003381  0.016821  0.011325  0.027546  0.001241  0.000640  0.023559   
A24      0.024044  0.058180  0.060504  0.069007  0.036522  0.041524  0.051971   
A25      0.029595  0.038940  0.004648  0.027313  0.013385  0.015482  0.017569   
A26      0.058243  0.040062  0.089724  0.052726  0.048345  0.051419  0.023242   
A27      0.024339  0.027123  0.022616  0.049662  0.031210  0.039601  0.040557   
A28      0.016082  0.019335  0.043113  0.001794  0.017612  0.015154  0.006748   
A29      0.030384  0.034419  0.035448  0.008777  0.014047  0.006894  0.004157   
A30      0.037751  0.003545  0.045847  0.039765  0.016582  0.025539  0.007308   
default  0.379729  0.348172  0.284188  0.205009  0.307176  0.299989  0.263578   

               A8        A9       A10  ...       A22       A23       A24  \
A1       0.410629  0.357320  0.325700  ...  0.018112  0.003381  0.024044   
A2       0.409334  0.440476  0.417044  ...  0.054012  0.016821  0.058180   
A3       0.589492  0.557666  0.531078  ...  0.050737  0.011325  0.060504   
A4       0.403194  0.515189  0.424182  ...  0.068794  0.027546  0.069007   
A5       0.468826  0.321403  0.303565  ...  0.031421  0.001241  0.036522   
A6       0.485549  0.361232  0.329261  ...  0.039143  0.000640  0.041524   
A7       0.310175  0.367284  0.330191  ...  0.057184  0.023559  0.051971   
A8       1.000000  0.349040  0.307173  ...  0.008049  0.018855  0.002216   
A9       0.349040  1.000000  0.901681  ...  0.061827  0.025930  0.067844   
A10      0.307173  0.901681  1.000000  ...  0.051922  0.018882  0.059247   
A11      0.327368  0.361254  0.352447  ...  0.032120  0.000298  0.033808   
A12      0.846586  0.278410  0.238588  ...  0.016165  0.016257  0.007703   
A13      0.369471  0.319507  0.286814  ...  0.026340  0.005352  0.030680   
A14      0.467738  0.122141  0.071365  ...  0.018876  0.004816  0.010160   
A15      0.156540  0.568362  0.562230  ...  0.096043  0.055130  0.105659   
A16      0.196217  0.201322  0.188255  ...  0.008604  0.015718  0.008609   
A17      0.257813  0.329361  0.301454  ...  0.038486  0.014277  0.042907   
A18      0.182768  0.174889  0.164813  ...  0.005322  0.009657  0.003884   
A19      0.160337  0.190374  0.179716  ...  0.010323  0.019085  0.010821   
A20      0.544268  0.152476  0.117396  ...  0.021079  0.030607  0.024445   
A21      0.006114  0.014976  0.021453  ...  0.390178  0.304997  0.311499   
A22      0.008049  0.061827  0.051922  ...  1.000000  0.697363  0.878205   
A23      0.018855  0.025930  0.018882  ...  0.697363  1.000000  0.711752   
A24      0.002216  0.067844  0.059247  ...  0.878205  0.711752  1.000000   
A25      0.002585  0.018899  0.037150  ...  0.644130  0.530874  0.629945   
A26      0.059962  0.065997  0.062349  ...  0.239890  0.516247  0.277336   
A27      0.006095  0.005767  0.008349  ...  0.297063  0.038863  0.326940   
A28      0.057320  0.033927  0.041617  ...  0.293955  0.247596  0.329126   
A29      0.056247  0.039926  0.044128  ...  0.208576  0.166237  0.217147   
A30      0.021121  0.026991  0.016335  ...  0.091805  0.248887  0.135571   
default  0.253846  0.218402  0.202581  ...  0.015739  0.000814  0.018183   

              A25       A26       A27       A28       A29       A30   default  
A1       0.029595  0.058243  0.024339  0.016082  0.030384  0.037751  0.379729  
A2       0.038940  0.040062  0.027123  0.019335  0.034419  0.003545  0.348172  
A3       0.004648  0.089724  0.022616  0.043113  0.035448  0.045847  0.284188  
A4       0.027313  0.052726  0.049662  0.001794  0.008777  0.039765  0.205009  
A5       0.013385  0.048345  0.031210  0.017612  0.014047  0.016582  0.307176  
A6       0.015482  0.051419  0.039601  0.015154  0.006894  0.025539  0.299989  
A7       0.017569  0.023242  0.040557  0.006748  0.004157  0.007308  0.263578  
A8       0.002585  0.059962  0.006095  0.057320  0.056247  0.021121  0.253846  
A9       0.018899  0.065997  0.005767  0.033927  0.039926  0.026991  0.218402  
A10      0.037150  0.062349  0.008349  0.041617  0.044128  0.016335  0.202581  
A11      0.023282  0.046768  0.000304  0.038167  0.046189  0.000764  0.265237  
A12      0.007964  0.051893  0.016695  0.025171  0.039909  0.023357  0.217501  
A13      0.010827  0.042673  0.027089  0.017296  0.012221  0.007792  0.241598  
A14      0.029538  0.030201  0.039988  0.005655  0.015422  0.017979  0.215327  
A15      0.005974  0.036478  0.038082  0.001126  0.013473  0.020310  0.183112  
A16      0.003082  0.044096  0.002541  0.034381  0.022561  0.020068  0.213077  
A17      0.021043  0.052722  0.010839  0.027533  0.028797  0.031417  0.160485  
A18      0.002156  0.037571  0.000865  0.028873  0.016575  0.017670  0.202788  
A19      0.005772  0.043228  0.006263  0.033676  0.029373  0.021555  0.161131  
A20      0.008235  0.054025  0.009739  0.036133  0.031360  0.024343  0.169789  
A21      0.064934  0.101947  0.131065  0.129005  0.108948  0.019865  0.002147  
A22      0.644130  0.239890  0.297063  0.293955  0.208576  0.091805  0.015739  
A23      0.530874  0.516247  0.038863  0.247596  0.166237  0.248887  0.000814  
A24      0.629945  0.277336  0.326940  0.329126  0.217147  0.135571  0.018183  
A25      1.000000  0.082840  0.076721  0.086608  0.061642  0.029160  0.000441  
A26      0.082840  1.000000  0.076863  0.408254  0.261237  0.661799  0.034026  
A27      0.076721  0.076863  1.000000  0.484557  0.340714  0.071456  0.004372  
A28      0.086608  0.408254  0.484557  1.000000  0.341482  0.127322  0.004713  
A29      0.061642  0.261237  0.340714  0.341482  1.000000  0.079655  0.022300  
A30      0.029160  0.661799  0.071456  0.127322  0.079655  1.000000  0.017074  
default  0.000441  0.034026  0.004372  0.004713  0.022300  0.017074  1.000000  

[31 rows x 31 columns]
In [16]:
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))
print(upper_tri)
         A1        A2        A3        A4        A5        A6        A7  \
A1      NaN  0.765991  0.500946  0.398379  0.524815  0.512808  0.525636   
A2      NaN       NaN  0.542537  0.421217  0.515083  0.511911  0.622148   
A3      NaN       NaN       NaN  0.669774  0.542767  0.582183  0.463837   
A4      NaN       NaN       NaN       NaN  0.340750  0.429054  0.392702   
A5      NaN       NaN       NaN       NaN       NaN  0.906412  0.400811   
A6      NaN       NaN       NaN       NaN       NaN       NaN  0.414438   
A7      NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A8      NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A9      NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A10     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A11     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A12     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A13     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A14     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A15     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A16     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A17     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A18     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A19     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A20     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A21     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A22     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A23     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A24     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A25     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A26     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A27     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A28     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A29     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
A30     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
default NaN       NaN       NaN       NaN       NaN       NaN       NaN   

               A8        A9       A10  ...       A22       A23       A24  \
A1       0.410629  0.357320  0.325700  ...  0.018112  0.003381  0.024044   
A2       0.409334  0.440476  0.417044  ...  0.054012  0.016821  0.058180   
A3       0.589492  0.557666  0.531078  ...  0.050737  0.011325  0.060504   
A4       0.403194  0.515189  0.424182  ...  0.068794  0.027546  0.069007   
A5       0.468826  0.321403  0.303565  ...  0.031421  0.001241  0.036522   
A6       0.485549  0.361232  0.329261  ...  0.039143  0.000640  0.041524   
A7       0.310175  0.367284  0.330191  ...  0.057184  0.023559  0.051971   
A8            NaN  0.349040  0.307173  ...  0.008049  0.018855  0.002216   
A9            NaN       NaN  0.901681  ...  0.061827  0.025930  0.067844   
A10           NaN       NaN       NaN  ...  0.051922  0.018882  0.059247   
A11           NaN       NaN       NaN  ...  0.032120  0.000298  0.033808   
A12           NaN       NaN       NaN  ...  0.016165  0.016257  0.007703   
A13           NaN       NaN       NaN  ...  0.026340  0.005352  0.030680   
A14           NaN       NaN       NaN  ...  0.018876  0.004816  0.010160   
A15           NaN       NaN       NaN  ...  0.096043  0.055130  0.105659   
A16           NaN       NaN       NaN  ...  0.008604  0.015718  0.008609   
A17           NaN       NaN       NaN  ...  0.038486  0.014277  0.042907   
A18           NaN       NaN       NaN  ...  0.005322  0.009657  0.003884   
A19           NaN       NaN       NaN  ...  0.010323  0.019085  0.010821   
A20           NaN       NaN       NaN  ...  0.021079  0.030607  0.024445   
A21           NaN       NaN       NaN  ...  0.390178  0.304997  0.311499   
A22           NaN       NaN       NaN  ...       NaN  0.697363  0.878205   
A23           NaN       NaN       NaN  ...       NaN       NaN  0.711752   
A24           NaN       NaN       NaN  ...       NaN       NaN       NaN   
A25           NaN       NaN       NaN  ...       NaN       NaN       NaN   
A26           NaN       NaN       NaN  ...       NaN       NaN       NaN   
A27           NaN       NaN       NaN  ...       NaN       NaN       NaN   
A28           NaN       NaN       NaN  ...       NaN       NaN       NaN   
A29           NaN       NaN       NaN  ...       NaN       NaN       NaN   
A30           NaN       NaN       NaN  ...       NaN       NaN       NaN   
default       NaN       NaN       NaN  ...       NaN       NaN       NaN   

              A25       A26       A27       A28       A29       A30   default  
A1       0.029595  0.058243  0.024339  0.016082  0.030384  0.037751  0.379729  
A2       0.038940  0.040062  0.027123  0.019335  0.034419  0.003545  0.348172  
A3       0.004648  0.089724  0.022616  0.043113  0.035448  0.045847  0.284188  
A4       0.027313  0.052726  0.049662  0.001794  0.008777  0.039765  0.205009  
A5       0.013385  0.048345  0.031210  0.017612  0.014047  0.016582  0.307176  
A6       0.015482  0.051419  0.039601  0.015154  0.006894  0.025539  0.299989  
A7       0.017569  0.023242  0.040557  0.006748  0.004157  0.007308  0.263578  
A8       0.002585  0.059962  0.006095  0.057320  0.056247  0.021121  0.253846  
A9       0.018899  0.065997  0.005767  0.033927  0.039926  0.026991  0.218402  
A10      0.037150  0.062349  0.008349  0.041617  0.044128  0.016335  0.202581  
A11      0.023282  0.046768  0.000304  0.038167  0.046189  0.000764  0.265237  
A12      0.007964  0.051893  0.016695  0.025171  0.039909  0.023357  0.217501  
A13      0.010827  0.042673  0.027089  0.017296  0.012221  0.007792  0.241598  
A14      0.029538  0.030201  0.039988  0.005655  0.015422  0.017979  0.215327  
A15      0.005974  0.036478  0.038082  0.001126  0.013473  0.020310  0.183112  
A16      0.003082  0.044096  0.002541  0.034381  0.022561  0.020068  0.213077  
A17      0.021043  0.052722  0.010839  0.027533  0.028797  0.031417  0.160485  
A18      0.002156  0.037571  0.000865  0.028873  0.016575  0.017670  0.202788  
A19      0.005772  0.043228  0.006263  0.033676  0.029373  0.021555  0.161131  
A20      0.008235  0.054025  0.009739  0.036133  0.031360  0.024343  0.169789  
A21      0.064934  0.101947  0.131065  0.129005  0.108948  0.019865  0.002147  
A22      0.644130  0.239890  0.297063  0.293955  0.208576  0.091805  0.015739  
A23      0.530874  0.516247  0.038863  0.247596  0.166237  0.248887  0.000814  
A24      0.629945  0.277336  0.326940  0.329126  0.217147  0.135571  0.018183  
A25           NaN  0.082840  0.076721  0.086608  0.061642  0.029160  0.000441  
A26           NaN       NaN  0.076863  0.408254  0.261237  0.661799  0.034026  
A27           NaN       NaN       NaN  0.484557  0.340714  0.071456  0.004372  
A28           NaN       NaN       NaN       NaN  0.341482  0.127322  0.004713  
A29           NaN       NaN       NaN       NaN       NaN  0.079655  0.022300  
A30           NaN       NaN       NaN       NaN       NaN       NaN  0.017074  
default       NaN       NaN       NaN       NaN       NaN       NaN       NaN  

[31 rows x 31 columns]
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.
In [17]:
#Looking for highly correlated variables

to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)]
print(); 
print(to_drop)
['A2', 'A6', 'A10', 'A11', 'A12', 'A18', 'A19', 'A24']
In [18]:
# So these features are highly correlated features that needs to be deleted
In [19]:
# Analysing dependent variable
In [20]:
#Plotting the disctribution of dependent variable on traininng_set
# increase font size of all elements on the graph using font_scale

sns.set(font_scale=3)

plt.subplots(figsize=(20,20))
sns.countplot(x='default',data=loan_data_train)
Out[20]:
<AxesSubplot:xlabel='default', ylabel='count'>
In [21]:
# So we can see that we have unbalanced datset
In [22]:
#The percentage of default loans in the training dataset is 

sum(loan_data_train['default'])/11500
Out[22]:
0.13043478260869565

Only 13% of the train dataset are default loans. So the model trained on this dataset will more likely predict non-default loans

In [29]:
#reduced train_dataset (excluding highly correlated variables)

loan_data_train_reduced=loan_data_train.drop(columns=to_drop)
In [31]:
loan_data_train_reduced.tail()
Out[31]:
A1 A3 A4 A5 A7 A8 A9 A13 A14 A15 ... A21 A22 A23 A25 A26 A27 A28 A29 A30 default
11495 14.55 8492.47 28646.00 46.15 0.40 90.21 1.615 32.12 64.50 199126.65 ... 15 0.00 0.00 0.04 0.88 72.39 18.19 10.84 0.00 0
11496 17.64 6303.06 8729.09 49.82 2.13 107.69 1.667 43.47 70.04 62373.21 ... 64 0.11 0.01 0.13 269.84 449.14 2.18 22.84 21.80 0
11497 28.31 5713.82 10946.15 38.15 3.29 102.87 1.250 25.67 69.23 160962.23 ... 27 0.03 0.00 0.10 0.00 201.04 12.08 6.98 0.00 0
11498 16.54 3028.71 4775.00 27.38 3.47 75.11 0.800 21.60 60.20 74871.60 ... 7 0.00 0.00 0.06 237.75 0.00 10.56 7.62 9.38 0
11499 19.92 3105.63 6366.67 44.44 1.21 150.70 1.000 34.00 55.64 29787.82 ... 47 0.06 0.00 0.11 72.11 841.82 104.60 98.00 0.00 0

5 rows × 23 columns

In [32]:
# Now lets do the same analysis for test set

Test set analysis

In [ ]:
 
In [33]:
loan_data_test=pd.read_csv('/Users/vladimirant/Desktop/Walmart Data Science Interview/Data Science HW design val.csv')
In [34]:
loan_data_test.tail()
Out[34]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 ... A22 A23 A24 A25 A26 A27 A28 A29 A30 default
11495 14.55 88.05 8492.47 28646.00 46.15 49.21 0.40 90.21 1.615 1.231 ... 0.00 0.00 0.00 0.04 0.88 72.39 18.19 10.84 0.00 0
11496 17.64 79.79 6303.06 8729.09 49.82 50.06 2.13 107.69 1.667 1.500 ... 0.11 0.01 0.01 0.13 269.84 449.14 2.18 22.84 21.80 0
11497 28.31 74.50 5713.82 10946.15 38.15 33.27 3.29 102.87 1.250 1.000 ... 0.03 0.00 0.00 0.10 0.00 201.04 12.08 6.98 0.00 0
11498 16.54 77.79 3028.71 4775.00 27.38 39.43 3.47 75.11 0.800 0.800 ... 0.00 0.00 0.00 0.06 237.75 0.00 10.56 7.62 9.38 0
11499 19.92 83.00 3105.63 6366.67 44.44 66.13 1.21 150.70 1.000 1.000 ... 0.06 0.00 0.02 0.11 72.11 841.82 104.60 98.00 0.00 0

5 rows × 31 columns

In [36]:
#Checking for missing values

pd.options.display.max_rows = None
# Sets the pandas dataframe options to display all columns/ rows.
loan_data_test.isnull().sum()
Out[36]:
A1         0
A2         0
A3         0
A4         0
A5         0
A6         0
A7         0
A8         0
A9         0
A10        0
A11        0
A12        0
A13        0
A14        0
A15        0
A16        0
A17        0
A18        0
A19        0
A20        0
A21        0
A22        0
A23        0
A24        0
A25        0
A26        0
A27        0
A28        0
A29        0
A30        0
default    0
dtype: int64
In [37]:
#Distribution of target varable in the test set (same distribution)
#increase font size of all elements
sns.set(font_scale=3)

plt.subplots(figsize=(20,20))
sns.countplot(x='default',data=loan_data_test)
Out[37]:
<AxesSubplot:xlabel='default', ylabel='count'>
In [39]:
#We have the same proportion of defaults in test dataset
sum(loan_data_test['default'])/11500
Out[39]:
0.13043478260869565
In [40]:
loan_data_test_reduced=loan_data_test.drop(columns=to_drop)

Preparing dataset for the logistic regression and ML model

In [41]:
# Creating inputs for future models
In [42]:
X_train, Y_train=loan_data_train_reduced[['A1', 'A3', 'A4', 'A5', 'A7', 'A8', 'A9', 'A13', 'A14', 'A15', 'A16',
       'A17', 'A20', 'A21', 'A22', 'A23', 'A25', 'A26', 'A27', 'A28', 'A29',
       'A30']],loan_data_test_reduced['default']
In [43]:
len(['A1', 'A3', 'A4', 'A5', 'A7', 'A8', 'A9', 'A13', 'A14', 'A15', 'A16',
       'A17', 'A20', 'A21', 'A22', 'A23', 'A25', 'A26', 'A27', 'A28', 'A29',
       'A30'])
Out[43]:
22
In [44]:
#creating X and Y for a test set (reduced test_set)
X_test, Y_test=loan_data_test_reduced[['A1', 'A3', 'A4', 'A5', 'A7', 'A8', 'A9', 'A13', 'A14', 'A15', 'A16',
       'A17', 'A20', 'A21', 'A22', 'A23', 'A25', 'A26', 'A27', 'A28', 'A29',
       'A30']],loan_data_test_reduced['default']

Building Logistic Regression Model

In [45]:
#Fitting the model on the train set

import statsmodels.api as sm
logit_model=sm.Logit(Y_train,X_train)
result=logit_model.fit()
print(result.summary2())
Optimization terminated successfully.
         Current function value: 0.303732
         Iterations 7
                         Results: Logit
================================================================
Model:              Logit            Pseudo R-squared: 0.216    
Dependent Variable: default          AIC:              7029.8372
Date:               2022-04-08 00:41 BIC:              7191.5395
No. Observations:   11500            Log-Likelihood:   -3492.9  
Df Model:           21               LL-Null:          -4452.9  
Df Residuals:       11478            LLR p-value:      0.0000   
Converged:          1.0000           Scale:            1.0000   
No. Iterations:     7.0000                                      
------------------------------------------------------------------
         Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
------------------------------------------------------------------
A1       0.0926     0.0055    16.8465   0.0000    0.0818    0.1034
A3      -0.0001     0.0000    -5.0179   0.0000   -0.0002   -0.0001
A4      -0.0000     0.0000    -0.3913   0.6956   -0.0000    0.0000
A5       0.0203     0.0043     4.6920   0.0000    0.0118    0.0288
A7       0.1121     0.0334     3.3604   0.0008    0.0467    0.1775
A8      -0.0141     0.0012   -11.3264   0.0000   -0.0166   -0.0117
A9      -0.1411     0.0590    -2.3923   0.0167   -0.2567   -0.0255
A13     -0.0025     0.0032    -0.7662   0.4436   -0.0088    0.0039
A14     -0.0175     0.0038    -4.6238   0.0000   -0.0250   -0.0101
A15     -0.0000     0.0000    -1.2542   0.2098   -0.0000    0.0000
A16      0.4280     0.1078     3.9695   0.0001    0.2167    0.6394
A17     -0.0001     0.0000    -2.8156   0.0049   -0.0001   -0.0000
A20     -0.2321     0.0538    -4.3123   0.0000   -0.3376   -0.1266
A21      0.0001     0.0003     0.2239   0.8228   -0.0006    0.0008
A22     -0.0702     0.3412    -0.2057   0.8370   -0.7389    0.5985
A23      0.8925     2.2677     0.3936   0.6939   -3.5520    5.3371
A25     -0.0801     0.2862    -0.2798   0.7796   -0.6411    0.4809
A26     -0.0002     0.0001    -1.4716   0.1411   -0.0004    0.0001
A27     -0.0001     0.0001    -1.5497   0.1212   -0.0003    0.0000
A28      0.0009     0.0003     2.8343   0.0046    0.0003    0.0015
A29      0.0001     0.0007     0.1464   0.8836   -0.0012    0.0014
A30     -0.0003     0.0005    -0.5465   0.5847   -0.0013    0.0007
================================================================

In [46]:
# So we need to exclude A4, A13, A15, A21, A23, A25, A26, A27, A29, A30
# because of high p-value (>0.05)
In [47]:
#creating new x-vector
X_train_2, Y_train=loan_data_train_reduced[['A1', 'A3', 'A5', 'A7', 'A8', 'A9', 'A14', 'A16',
       'A17', 'A20','A28']],loan_data_test_reduced['default']
In [48]:
#building the model with new x-vector

import statsmodels.api as sm
logit_model_2=sm.Logit(Y_train,X_train_2)
result=logit_model_2.fit()
print(result.summary2())
Optimization terminated successfully.
         Current function value: 0.304115
         Iterations 7
                         Results: Logit
================================================================
Model:              Logit            Pseudo R-squared: 0.215    
Dependent Variable: default          AIC:              7016.6349
Date:               2022-04-08 00:41 BIC:              7097.4860
No. Observations:   11500            Log-Likelihood:   -3497.3  
Df Model:           10               LL-Null:          -4452.9  
Df Residuals:       11489            LLR p-value:      0.0000   
Converged:          1.0000           Scale:            1.0000   
No. Iterations:     7.0000                                      
------------------------------------------------------------------
         Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
------------------------------------------------------------------
A1       0.0941     0.0054    17.4267   0.0000    0.0835    0.1047
A3      -0.0001     0.0000    -6.4865   0.0000   -0.0002   -0.0001
A5       0.0194     0.0036     5.3344   0.0000    0.0123    0.0265
A7       0.1192     0.0324     3.6838   0.0002    0.0558    0.1827
A8      -0.0140     0.0012   -11.4925   0.0000   -0.0164   -0.0116
A9      -0.1684     0.0561    -3.0026   0.0027   -0.2783   -0.0585
A14     -0.0194     0.0035    -5.5115   0.0000   -0.0262   -0.0125
A16      0.4143     0.1073     3.8621   0.0001    0.2040    0.6245
A17     -0.0001     0.0000    -3.1273   0.0018   -0.0001   -0.0000
A20     -0.2461     0.0529    -4.6537   0.0000   -0.3498   -0.1425
A28      0.0004     0.0002     1.7620   0.0781   -0.0000    0.0009
================================================================

In [49]:
#Excluding A28 due high p-value
X_train_3, Y_train=loan_data_train_reduced[['A1', 'A3', 'A5', 'A7', 'A8', 'A9', 'A14', 'A16',
       'A17', 'A20']],loan_data_test_reduced['default']
In [50]:
# building log reg model with new vector X 
#Now we can see that allthe coefficients are significant

import statsmodels.api as sm
logit_model_3=sm.Logit(Y_train,X_train_3)
result=logit_model_3.fit()
print(result.summary2())
Optimization terminated successfully.
         Current function value: 0.304246
         Iterations 7
                         Results: Logit
================================================================
Model:              Logit            Pseudo R-squared: 0.214    
Dependent Variable: default          AIC:              7017.6501
Date:               2022-04-08 00:41 BIC:              7091.1511
No. Observations:   11500            Log-Likelihood:   -3498.8  
Df Model:           9                LL-Null:          -4452.9  
Df Residuals:       11490            LLR p-value:      0.0000   
Converged:          1.0000           Scale:            1.0000   
No. Iterations:     7.0000                                      
------------------------------------------------------------------
         Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
------------------------------------------------------------------
A1       0.0940     0.0054    17.4161   0.0000    0.0835    0.1046
A3      -0.0001     0.0000    -6.4223   0.0000   -0.0002   -0.0001
A5       0.0194     0.0036     5.3323   0.0000    0.0123    0.0265
A7       0.1224     0.0323     3.7862   0.0002    0.0590    0.1858
A8      -0.0139     0.0012   -11.4270   0.0000   -0.0162   -0.0115
A9      -0.1668     0.0560    -2.9780   0.0029   -0.2766   -0.0570
A14     -0.0192     0.0035    -5.4660   0.0000   -0.0260   -0.0123
A16      0.4053     0.1071     3.7834   0.0002    0.1953    0.6152
A17     -0.0001     0.0000    -3.0967   0.0020   -0.0001   -0.0000
A20     -0.2445     0.0529    -4.6245   0.0000   -0.3481   -0.1409
================================================================

In [51]:
#building "final' logistic regression 
#fitting the model on the last training set 

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train_3, Y_train)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
Out[51]:
LogisticRegression()
In [53]:
#predicting using the model on the train dataset

# calcaulating the accuracy on the training set

y_pred = logreg.predict(X_train_3)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_train_3, Y_train)))
Accuracy of logistic regression classifier on test set: 0.88
In [54]:
#confusion matrix on the train dataset

from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_train, y_pred)
print(confusion_matrix)
[[9795  205]
 [1149  351]]
In [55]:
#confusion matrix for the train set

from sklearn.metrics import classification_report
print(classification_report(Y_train, y_pred))
              precision    recall  f1-score   support

           0       0.90      0.98      0.94     10000
           1       0.63      0.23      0.34      1500

    accuracy                           0.88     11500
   macro avg       0.76      0.61      0.64     11500
weighted avg       0.86      0.88      0.86     11500

In [56]:
# we can clearly see that the model built on original dataset
# have troubles with predicting class 1:
# all metrics of confusion matrix are lower for class 1 than for class 0

Calculating AUC

In [57]:
#AUC for train_set (Logistic Reg)


from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(Y_train, logreg.predict(X_train_3))
fpr, tpr, thresholds = roc_curve(Y_train, logreg.predict_proba(X_train_3)[:,1])
plt.figure(figsize=(20,20))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
In [58]:
#Let's check AUC for a test_set
In [59]:
X_test_3=loan_data_test_reduced[['A1', 'A3', 'A5', 'A7', 'A8', 'A9', 'A14', 'A16',
       'A17', 'A20']]
In [60]:
y_pred_test = logreg.predict(X_test_3)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test_3, Y_test)))
Accuracy of logistic regression classifier on test set: 0.88
In [61]:
logit_roc_auc = roc_auc_score(Y_test, logreg.predict(X_test_3))
fpr, tpr, thresholds = roc_curve(Y_test, logreg.predict_proba(X_test_3)[:,1])
plt.figure(figsize=(20,20))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

So we can see that AUC for both train and test set is 0.61

Thats not great result, but more than 0.5 anyway

In [ ]:
 
In [62]:
#now I am trying to run logistic regression on scaled data
In [63]:
#Scaling the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_sc=scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)
In [64]:
logit_model_scaled=sm.Logit(Y_train,X_train_scaled)
result=logit_model_scaled.fit()
print(result.summary2())
Optimization terminated successfully.
         Current function value: 0.300204
         Iterations 8
                         Results: Logit
================================================================
Model:              Logit            Pseudo R-squared: 0.225    
Dependent Variable: default          AIC:              6948.6858
Date:               2022-04-08 00:46 BIC:              7110.3881
No. Observations:   11500            Log-Likelihood:   -3452.3  
Df Model:           21               LL-Null:          -4452.9  
Df Residuals:       11478            LLR p-value:      0.0000   
Converged:          1.0000           Scale:            1.0000   
No. Iterations:     8.0000                                      
------------------------------------------------------------------
        Coef.     Std.Err.      z      P>|z|     [0.025     0.975]
------------------------------------------------------------------
x1       9.8209     0.5576   17.6114   0.0000     8.7279   10.9138
x2      -1.8536     0.6458   -2.8702   0.0041    -3.1193   -0.5878
x3      -0.1114     1.1745   -0.0949   0.9244    -2.4135    2.1906
x4       2.7424     0.5582    4.9132   0.0000     1.6484    3.8364
x5       1.4153     0.3733    3.7915   0.0001     0.6837    2.1469
x6      -3.0552     0.4308   -7.0911   0.0000    -3.8996   -2.2107
x7      -1.2633     0.5996   -2.1067   0.0351    -2.4386   -0.0880
x8      -0.2658     0.5207   -0.5105   0.6097    -1.2863    0.7547
x9      -0.3521     0.5258   -0.6696   0.5031    -1.3827    0.6785
x10     -2.1211     0.6590   -3.2184   0.0013    -3.4128   -0.8294
x11      1.7600     0.4324    4.0704   0.0000     0.9125    2.6075
x12     -0.3361     0.5005   -0.6716   0.5018    -1.3170    0.6448
x13     -0.5490     0.3533   -1.5538   0.1202    -1.2414    0.1435
x14      2.4769     3.6679    0.6753   0.4995    -4.7120    9.6658
x15      0.0709     1.7153    0.0413   0.9670    -3.2910    3.4328
x16      0.4548     1.5621    0.2911   0.7709    -2.6069    3.5165
x17     -0.0329     0.6701   -0.0490   0.9609    -1.3462    1.2805
x18      0.0337     3.3085    0.0102   0.9919    -6.4509    6.5184
x19      0.0416     0.7064    0.0589   0.9531    -1.3428    1.4260
x20      1.5343     0.5983    2.5643   0.0103     0.3616    2.7071
x21     -8.3918     1.0740   -7.8133   0.0000   -10.4969   -6.2867
x22    -13.1798     7.7551   -1.6995   0.0892   -28.3796    2.0200
================================================================

In [65]:
#Now let's try it for final set of variables choosen by initially selected variables
In [66]:
# creating scaling inputs
X_train3_sc=scaler.fit(X_train_3)
X_train_3_scaled=scaler.transform(X_train_3)
X_test_3_scaled=scaler.transform(X_test_3)
In [67]:
# Log Reg on scaled data
logreg_scaled = LogisticRegression()
logreg_scaled.fit(X_train_3_scaled, Y_train)
Out[67]:
LogisticRegression()
In [68]:
#building logistic regression on scaled data
logit_model_scaled_3=sm.Logit(Y_train,X_train_3_scaled)
result=logit_model_scaled_3.fit()
print(result.summary2())
Optimization terminated successfully.
         Current function value: 0.304246
         Iterations 7
                         Results: Logit
================================================================
Model:              Logit            Pseudo R-squared: 0.214    
Dependent Variable: default          AIC:              7017.6501
Date:               2022-04-08 00:46 BIC:              7091.1511
No. Observations:   11500            Log-Likelihood:   -3498.8  
Df Model:           9                LL-Null:          -4452.9  
Df Residuals:       11490            LLR p-value:      0.0000   
Converged:          1.0000           Scale:            1.0000   
No. Iterations:     7.0000                                      
------------------------------------------------------------------
         Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
------------------------------------------------------------------
x1       9.4047     0.5400    17.4161   0.0000    8.3463   10.4630
x2      -3.4842     0.5425    -6.4223   0.0000   -4.5475   -2.4209
x3       2.4799     0.4651     5.3323   0.0000    1.5684    3.3914
x4       1.3562     0.3582     3.7862   0.0002    0.6541    2.0582
x5      -4.4309     0.3878   -11.4270   0.0000   -5.1909   -3.6709
x6      -1.6679     0.5601    -2.9780   0.0029   -2.7655   -0.5702
x7      -2.4084     0.4406    -5.4660   0.0000   -3.2720   -1.5448
x8       1.6211     0.4285     3.7834   0.0002    0.7813    2.4609
x9      -1.4707     0.4749    -3.0967   0.0020   -2.4016   -0.5399
x10     -1.5331     0.3315    -4.6245   0.0000   -2.1829   -0.8834
================================================================

In [69]:
#AUC on train set for scaled data

logit_roc_auc = roc_auc_score(Y_train, logreg_scaled.predict(X_train_3_scaled))
fpr, tpr, thresholds = roc_curve(Y_train, logreg_scaled.predict_proba(X_train_3_scaled)[:,1])
plt.figure(figsize=(20,20))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
In [70]:
#AUC on test set for scaled data

logit_roc_auc = roc_auc_score(Y_test, logreg_scaled.predict(X_test_3_scaled))
fpr, tpr, thresholds = roc_curve(Y_test, logreg_scaled.predict_proba(X_test_3_scaled)[:,1])
plt.figure(figsize=(20,20))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
In [71]:
# Using scaling data did not help to improve the model
In [ ]:
 

Undersampling

In [72]:
# The train set looks unbalanced, thus i am trying to use more balanced dataset
# I am reducing the trainset so it has equal proportion of both clasess
In [73]:
# train_set positives 
train_set_positives=loan_data_train.loc[loan_data_train['default'] == 1]
train_set_positives.tail()
Out[73]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 ... A22 A23 A24 A25 A26 A27 A28 A29 A30 default
1495 4.50 96.21 6872.92 10583.50 25.08 21.83 0.86 133.77 1.714 2.000 ... 0.04 0.01 0.01 0.03 2110.11 0.00 223.98 167.46 96.93 1
1496 14.13 57.75 3087.25 7975.00 73.75 79.50 3.42 88.33 0.250 0.250 ... 0.04 0.00 0.00 0.03 216.27 310.13 158.68 16.86 0.00 1
1497 22.61 76.70 4759.71 10300.13 49.27 57.36 2.72 80.50 1.000 0.818 ... 0.05 0.00 0.02 0.06 36.59 2116.43 787.84 41.35 0.00 1
1498 15.00 50.50 4880.11 5508.89 53.67 60.89 5.17 84.56 1.500 1.500 ... 0.03 0.00 0.01 0.13 0.00 309.61 15.86 52.18 0.00 1
1499 21.18 74.21 3212.50 4930.77 52.31 51.42 1.58 85.75 1.286 1.286 ... 0.03 0.00 0.00 0.11 0.00 442.57 153.04 29.82 0.00 1

5 rows × 31 columns

In [74]:
# train_set negatives 
train_set_negatives=loan_data_train.loc[loan_data_train['default'] == 0]
train_set_negatives.tail()
Out[74]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 ... A22 A23 A24 A25 A26 A27 A28 A29 A30 default
11495 14.55 88.05 8492.47 28646.00 46.15 49.21 0.40 90.21 1.615 1.231 ... 0.00 0.00 0.00 0.04 0.88 72.39 18.19 10.84 0.00 0
11496 17.64 79.79 6303.06 8729.09 49.82 50.06 2.13 107.69 1.667 1.500 ... 0.11 0.01 0.01 0.13 269.84 449.14 2.18 22.84 21.80 0
11497 28.31 74.50 5713.82 10946.15 38.15 33.27 3.29 102.87 1.250 1.000 ... 0.03 0.00 0.00 0.10 0.00 201.04 12.08 6.98 0.00 0
11498 16.54 77.79 3028.71 4775.00 27.38 39.43 3.47 75.11 0.800 0.800 ... 0.00 0.00 0.00 0.06 237.75 0.00 10.56 7.62 9.38 0
11499 19.92 83.00 3105.63 6366.67 44.44 66.13 1.21 150.70 1.000 1.000 ... 0.06 0.00 0.02 0.11 72.11 841.82 104.60 98.00 0.00 0

5 rows × 31 columns

In [75]:
#sample of negatives

train_set_negatives_1500=train_set_negatives.sample(n=1500, random_state=3)
In [76]:
#creating balanced training set

train_set_US=pd.concat([train_set_negatives_1500, train_set_positives], ignore_index=True)
train_set_US.describe()
Out[76]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 ... A22 A23 A24 A25 A26 A27 A28 A29 A30 default
count 3000.000000 3000.000000 3000.000000 3000.000000 3000.000000 3000.000000 3000.000000 3000.000000 3000.000000 3000.00000 ... 3000.000000 3000.000000 3000.000000 3000.000000 3000.000000 3000.000000 3000.000000 3000.000000 3000.000000 3000.000000
mean 15.858773 79.569383 6255.497407 11078.600383 38.938407 43.380670 1.911893 112.063447 1.460360 1.22596 ... 0.081987 0.009357 0.015053 0.136250 264.425663 354.423387 101.389723 25.652367 37.448543 0.500000
std 8.218178 10.336743 2699.340183 7573.451428 14.628862 17.541191 1.157369 35.984742 0.762602 0.63808 ... 0.164058 0.025134 0.031711 0.152962 425.078884 584.169704 135.791234 50.017836 79.748017 0.500083
min 0.000000 16.710000 0.000000 182.500000 0.670000 0.000000 0.000000 0.000000 0.000000 0.00000 ... 0.000000 0.000000 0.000000 0.000000 -157.850000 -140.940000 -23.870000 -172.690000 -168.000000 0.000000
25% 10.290000 73.330000 4291.375000 6400.000000 29.872500 32.030000 1.130000 87.030000 1.000000 0.92675 ... 0.020000 0.000000 0.000000 0.060000 0.000000 0.000000 13.730000 0.000000 0.000000 0.000000
50% 15.260000 80.455000 6168.120000 9750.370000 36.750000 40.985000 1.800000 113.255000 1.465500 1.25000 ... 0.040000 0.000000 0.010000 0.090000 102.305000 122.675000 56.330000 9.000000 0.000000 0.500000
75% 21.280000 86.780000 8015.437500 13565.960000 47.500000 53.505000 2.500000 135.832500 1.923000 1.60000 ... 0.090000 0.010000 0.020000 0.140000 330.835000 482.912500 134.775000 31.562500 45.635000 1.000000
max 100.000000 100.000000 19067.500000 132787.500000 128.000000 128.000000 11.080000 319.670000 6.000000 6.00000 ... 4.920000 0.680000 0.770000 1.460000 4137.400000 9094.610000 1552.350000 1195.000000 1077.500000 1.000000

8 rows × 31 columns

In [78]:
train_set_US.tail()
Out[78]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 ... A22 A23 A24 A25 A26 A27 A28 A29 A30 default
2995 4.50 96.21 6872.92 10583.50 25.08 21.83 0.86 133.77 1.714 2.000 ... 0.04 0.01 0.01 0.03 2110.11 0.00 223.98 167.46 96.93 1
2996 14.13 57.75 3087.25 7975.00 73.75 79.50 3.42 88.33 0.250 0.250 ... 0.04 0.00 0.00 0.03 216.27 310.13 158.68 16.86 0.00 1
2997 22.61 76.70 4759.71 10300.13 49.27 57.36 2.72 80.50 1.000 0.818 ... 0.05 0.00 0.02 0.06 36.59 2116.43 787.84 41.35 0.00 1
2998 15.00 50.50 4880.11 5508.89 53.67 60.89 5.17 84.56 1.500 1.500 ... 0.03 0.00 0.01 0.13 0.00 309.61 15.86 52.18 0.00 1
2999 21.18 74.21 3212.50 4930.77 52.31 51.42 1.58 85.75 1.286 1.286 ... 0.03 0.00 0.00 0.11 0.00 442.57 153.04 29.82 0.00 1

5 rows × 31 columns

In [79]:
# building inputs from new training set for the same logistic model
In [139]:
#Plotting heatmap/correlation table

plt.subplots(figsize=(70,70))
corr2 = train_set_US.corr()
sb.heatmap(corr2, annot=True,)
Out[139]:
<AxesSubplot:>
In [81]:
#finding highly correlated features to drop

cor_matrix2 = train_set_US.corr().abs()
upper_tri2 = cor_matrix2.where(np.triu(np.ones(cor_matrix2.shape),k=1).astype(np.bool))
to_drop_US = [column for column in upper_tri2.columns if any(upper_tri2[column] > 0.75)]
to_drop_US
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:4: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.
Out[81]:
['A6', 'A10', 'A11', 'A12', 'A18', 'A19', 'A24']
In [82]:
#Creting reduced train set for Und Samp
train_set_US_reduced=train_set_US.drop(columns=to_drop_US)
In [83]:
#Crearing train inputs for US data 
X_train_US=train_set_US_reduced.loc[:,train_set_US_reduced.columns!='default']
Y_train_US=train_set_US_reduced['default']
In [84]:
#Log reg for US 
In [85]:
# building log reg model with new vector X for US


logit_model_US=sm.Logit(Y_train_US,X_train_US)
result=logit_model_US.fit()
print(result.summary2())
Optimization terminated successfully.
         Current function value: 0.522151
         Iterations 7
                          Results: Logit
==================================================================
Model:              Logit            Pseudo R-squared: 0.247      
Dependent Variable: default          AIC:              3178.9050  
Date:               2022-04-08 00:49 BIC:              3317.0515  
No. Observations:   3000             Log-Likelihood:   -1566.5    
Df Model:           22               LL-Null:          -2079.4    
Df Residuals:       2977             LLR p-value:      5.7749e-203
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     7.0000                                        
---------------------------------------------------------------------
        Coef.     Std.Err.       z       P>|z|      [0.025     0.975]
---------------------------------------------------------------------
A1      0.0862      0.0081    10.6545    0.0000     0.0703     0.1020
A2     -0.0177      0.0040    -4.4654    0.0000    -0.0255    -0.0100
A3     -0.0000      0.0000    -1.3396    0.1804    -0.0001     0.0000
A4     -0.0000      0.0000    -0.0442    0.9648    -0.0000     0.0000
A5      0.0223      0.0063     3.5383    0.0004     0.0099     0.0346
A7      0.1009      0.0511     1.9726    0.0485     0.0006     0.2011
A8     -0.0062      0.0018    -3.4687    0.0005    -0.0098    -0.0027
A9     -0.1012      0.0803    -1.2599    0.2077    -0.2585     0.0562
A13    -0.0061      0.0047    -1.3068    0.1913    -0.0153     0.0031
A14     0.0040      0.0057     0.6976    0.4854    -0.0073     0.0153
A15    -0.0000      0.0000    -2.9162    0.0035    -0.0000    -0.0000
A16     0.3812      0.1643     2.3197    0.0204     0.0591     0.7033
A17     0.0000      0.0000     1.2405    0.2148    -0.0000     0.0001
A20     0.0266      0.0833     0.3192    0.7495    -0.1368     0.1900
A21     0.0000      0.0003     0.0147    0.9883    -0.0006     0.0006
A22     0.2009      0.5577     0.3603    0.7186    -0.8922     1.2941
A23    -2.1891      3.2425    -0.6751    0.4996    -8.5443     4.1662
A25     0.1175      0.4077     0.2881    0.7733    -0.6816     0.9165
A26    -0.0000      0.0002    -0.1355    0.8922    -0.0003     0.0003
A27    -0.0001      0.0001    -1.3079    0.1909    -0.0003     0.0001
A28     0.0007      0.0004     1.5234    0.1276    -0.0002     0.0015
A29     0.0004      0.0010     0.4248    0.6710    -0.0016     0.0024
A30    -0.0002      0.0007    -0.3017    0.7628    -0.0015     0.0011
==================================================================

In [86]:
#dropping variables based on p-values
In [87]:
X_train_US_2=X_train_US[['A1','A3','A5','A7','A8', 'A15','A16']]
In [89]:
# building log reg model with new vector X 


logit_model_US_2=sm.Logit(Y_train_US,X_train_US_2)
result=logit_model_US_2.fit()
print(result.summary2())
Optimization terminated successfully.
         Current function value: 0.527557
         Iterations 6
                          Results: Logit
==================================================================
Model:              Logit            Pseudo R-squared: 0.239      
Dependent Variable: default          AIC:              3179.3435  
Date:               2022-04-08 00:49 BIC:              3221.3880  
No. Observations:   3000             Log-Likelihood:   -1582.7    
Df Model:           6                LL-Null:          -2079.4    
Df Residuals:       2993             LLR p-value:      2.2317e-211
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     6.0000                                        
---------------------------------------------------------------------
        Coef.     Std.Err.       z       P>|z|      [0.025     0.975]
---------------------------------------------------------------------
A1      0.0897      0.0075    11.9334    0.0000     0.0749     0.1044
A3     -0.0001      0.0000    -3.5288    0.0004    -0.0001    -0.0000
A5      0.0104      0.0033     3.1194    0.0018     0.0039     0.0170
A7      0.1021      0.0473     2.1572    0.0310     0.0093     0.1948
A8     -0.0101      0.0015    -6.8005    0.0000    -0.0130    -0.0072
A15    -0.0000      0.0000    -4.4367    0.0000    -0.0000    -0.0000
A16     0.3508      0.1603     2.1887    0.0286     0.0367     0.6649
==================================================================

In [90]:
#all variables are significant, lets run the model
In [91]:
# Log Reg on US data
logreg_US= LogisticRegression()
logreg_US.fit(X_train_US_2, Y_train_US)
Out[91]:
LogisticRegression()
In [92]:
#AUC for train_set (Logistic Reg US data)

logit_roc_auc = roc_auc_score(Y_train_US, logreg_US.predict(X_train_US_2))
fpr, tpr, thresholds = roc_curve(Y_train_US, logreg_US.predict_proba(X_train_US_2)[:,1])
plt.figure(figsize=(20,20))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
In [140]:
# As we can see Logistic regression on the balanced dataset improved and use fewer cariables
In [93]:
X_test_US=loan_data_test[['A1','A3','A5','A7','A8', 'A15','A16']]
In [94]:
#AUC for test_set (Logistic Reg US data)

logit_roc_auc = roc_auc_score(Y_test, logreg_US.predict(X_test_US))
fpr, tpr, thresholds = roc_curve(Y_test, logreg_US.predict_proba(X_test_US)[:,1])
plt.figure(figsize=(20,20))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

So training logistic regression on the smaller but balanced set

provides us with lighter model and higher AUC

on both training and test set

In [ ]:
 

Saving the results for LR Model with Undersampling

In [165]:
# Predicted probabilities to be classified as 1

LR_prob1_array=logreg_US.predict_proba(X_test_US)[:,1]
df_res1=pd.DataFrame(LR_prob1_array)
In [168]:
# saving the dataframe
df_res1.to_csv('results1.csv')
In [ ]:
 

ML Part

In [104]:
#Importing libraries
import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
In [97]:
#Creating Random Forest Clasifier
model_RFC = RandomForestClassifier()
In [99]:
#Running cross validation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model_RFC, X_train, Y_train, scoring='roc_auc', cv=cv, n_jobs=-1, error_score='raise')
In [100]:
#Auc of the model on the train set (no US)

np.mean(n_scores)
Out[100]:
0.815399
In [101]:
#Fitting the model on the train set
model_RFC.fit(X_train, Y_train)
Out[101]:
RandomForestClassifier()
In [102]:
#Predicting model on the test_set
model_RFC.predict(X_test)
Out[102]:
array([1, 1, 1, ..., 0, 0, 0])
In [105]:
#measuring AUC on test_set for RFC|

sklearn.metrics.roc_auc_score(Y_test,model_RFC.predict(X_test))
Out[105]:
1.0
In [171]:
#creating array of output probability predictions to be classified as 1

RFC_array=model_RFC.predict_proba(X_test)
RFC_array_p1=RFC_array[:,1]
df_res2=pd.DataFrame()

# saving the dataframe
df_res2.to_csv('results2.csv')
In [172]:
#Checking the resulting array for ML model

RFC_array_p1
Out[172]:
array([0.75, 0.7 , 0.93, ..., 0.09, 0.05, 0.09])
In [ ]:
 

Conclusion

In [111]:
#Logistic regression trained on the balanced dataset provides better performance
#both on the train and test set with AUC equal to 0.74 
#in comparison with LR model trained on the whole dataset (AUC 0.61)
In [112]:
# Random forest classifier provides higher accuracy than Logistic Regression
# With perfect AUC on the test dataset
In [ ]:
 
In [ ]:
 

P.S.

In [173]:
# As bonus i decided to check LR with OVERSAMPLING
# Just for experimental purposes
# Normally for the logistic regression data should not come from repeated 
# measurements
In [157]:
# Now I want to try logistic regression with Over sampling
In [158]:
train_set_positives.tail()
Out[158]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 ... A22 A23 A24 A25 A26 A27 A28 A29 A30 default
1495 4.50 96.21 6872.92 10583.50 25.08 21.83 0.86 133.77 1.714 2.000 ... 0.04 0.01 0.01 0.03 2110.11 0.00 223.98 167.46 96.93 1
1496 14.13 57.75 3087.25 7975.00 73.75 79.50 3.42 88.33 0.250 0.250 ... 0.04 0.00 0.00 0.03 216.27 310.13 158.68 16.86 0.00 1
1497 22.61 76.70 4759.71 10300.13 49.27 57.36 2.72 80.50 1.000 0.818 ... 0.05 0.00 0.02 0.06 36.59 2116.43 787.84 41.35 0.00 1
1498 15.00 50.50 4880.11 5508.89 53.67 60.89 5.17 84.56 1.500 1.500 ... 0.03 0.00 0.01 0.13 0.00 309.61 15.86 52.18 0.00 1
1499 21.18 74.21 3212.50 4930.77 52.31 51.42 1.58 85.75 1.286 1.286 ... 0.03 0.00 0.00 0.11 0.00 442.57 153.04 29.82 0.00 1

5 rows × 31 columns

In [127]:
#Checking how can i call specific rows
train_set_positives.loc[[2,3,2]]
Out[127]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 ... A22 A23 A24 A25 A26 A27 A28 A29 A30 default
2 25.16 64.20 4758.44 7818.15 46.28 50.72 2.69 110.16 1.124 0.889 ... 0.08 0.02 0.02 0.10 622.70 755.52 102.89 6.24 0.0 1
3 19.50 77.81 5762.27 6290.00 58.14 76.27 2.44 119.92 1.222 1.000 ... 0.04 0.00 0.01 0.08 197.55 396.27 76.34 8.44 2.0 1
2 25.16 64.20 4758.44 7818.15 46.28 50.72 2.69 110.16 1.124 0.889 ... 0.08 0.02 0.02 0.10 622.70 755.52 102.89 6.24 0.0 1

3 rows × 31 columns

In [159]:
#Generating indexes for oversampling
a=range(1500)
In [160]:
#Generating indexes for oversampling
np.random.seed(0)
OS_indexes=np.random.choice(a,size=10000)
In [161]:
#Creating 10000 positves set
train_set_positives_OS=train_set_positives.loc[OS_indexes]
train_set_positives_OS.tail()
Out[161]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 ... A22 A23 A24 A25 A26 A27 A28 A29 A30 default
434 13.47 81.65 7321.74 16065.00 31.89 34.61 1.67 130.46 1.947 1.590 ... 0.18 0.01 0.05 0.18 235.12 2244.54 348.34 5.91 194.21 1
1019 13.91 80.32 7368.64 10437.87 34.28 38.18 1.83 115.46 1.462 1.325 ... 0.09 0.01 0.04 0.32 0.00 699.50 215.71 40.14 0.00 1
1097 24.69 68.93 12924.38 23100.00 31.90 39.50 2.87 140.58 2.429 1.857 ... 0.00 0.00 0.00 0.05 0.00 0.00 0.00 0.00 0.00 1
1323 19.13 82.00 8886.86 10887.50 24.63 27.86 1.10 106.13 1.667 2.000 ... 0.05 0.00 0.00 0.07 0.00 385.72 53.26 26.82 0.00 1
18 13.40 85.00 10750.00 8250.00 40.00 5.00 4.22 129.00 1.750 1.750 ... 0.03 0.00 0.01 0.05 113.81 0.00 368.72 51.00 0.00 1

5 rows × 31 columns

In [162]:
#creating balanced training set Oversampling

train_set_OS=pd.concat([train_set_negatives, train_set_positives_OS], ignore_index=True)
train_set_OS.describe()
Out[162]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 ... A22 A23 A24 A25 A26 A27 A28 A29 A30 default
count 20000.000000 20000.000000 20000.000000 20000.000000 20000.000000 20000.000000 20000.000000 20000.000000 20000.000000 20000.000000 ... 20000.000000 20000.000000 20000.000000 20000.000000 20000.000000 20000.000000 20000.000000 20000.000000 20000.000000 20000.000000
mean 15.862498 79.491978 6269.318080 11143.956735 38.719786 43.159468 1.916993 111.894729 1.461187 1.226283 ... 0.080369 0.009093 0.014767 0.135485 267.011296 359.430269 100.514626 25.921297 37.186021 0.500000
std 8.128933 10.304064 2745.768525 7395.553107 14.478635 17.369773 1.153801 36.110244 0.765058 0.637896 ... 0.144907 0.022739 0.029988 0.153256 469.745918 585.113127 136.080745 51.167509 127.638560 0.500013
min 0.000000 16.710000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 -0.010000 -0.010000 0.000000 -559.990000 -140.940000 -187.120000 -447.680000 -223.890000 0.000000
25% 10.390000 73.250000 4233.330000 6375.000000 29.727500 32.000000 1.140000 86.365000 1.000000 0.942750 ... 0.020000 0.000000 0.000000 0.060000 0.000000 0.000000 13.380000 0.000000 0.000000 0.000000
50% 15.190000 80.470000 6213.560000 9844.605000 36.400000 40.625000 1.780000 112.555000 1.473000 1.250000 ... 0.040000 0.000000 0.010000 0.090000 102.300000 121.820000 56.170000 9.580000 0.000000 0.500000
75% 21.382500 86.640000 8028.440000 13772.230000 47.170000 53.480000 2.500000 135.845000 1.923000 1.600000 ... 0.080000 0.010000 0.020000 0.140000 337.015000 486.565000 134.022500 31.960000 44.852500 1.000000
max 100.000000 100.000000 26862.500000 132787.500000 128.000000 128.000000 11.080000 319.670000 10.000000 6.000000 ... 4.920000 0.680000 0.770000 2.340000 26625.900000 9094.610000 1682.140000 1312.530000 14423.670000 1.000000

8 rows × 31 columns

In [138]:
#Plotting heatmap/correlation table and finding the highly correlated variables to drop
#Doing the same as we deed before

plt.subplots(figsize=(70,70))
corr3 = train_set_OS.corr()
sb.heatmap(corr3, annot=True,)
cor_matrix3 = train_set_OS.corr().abs()
upper_tri3 = cor_matrix3.where(np.triu(np.ones(cor_matrix3.shape),k=1).astype(np.bool))
to_drop_OS = [column for column in upper_tri3.columns if any(upper_tri3[column] > 0.75)]
to_drop_OS
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:8: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Out[138]:
['A2', 'A6', 'A10', 'A11', 'A12', 'A18', 'A19', 'A24']
In [140]:
#Creting reduced train set for Ov Samp
train_set_OS_reduced=train_set_US.drop(columns=to_drop_OS)

#Crearing train inputs for OS data 
X_train_OS=train_set_OS_reduced.loc[:,train_set_OS_reduced.columns!='default']
Y_train_OS=train_set_OS_reduced['default']


# building Log Reg model with new vector X for US

logit_model_OS=sm.Logit(Y_train_OS,X_train_OS)
result=logit_model_OS.fit()
print(result.summary2())
Optimization terminated successfully.
         Current function value: 0.525519
         Iterations 7
                          Results: Logit
==================================================================
Model:              Logit            Pseudo R-squared: 0.242      
Dependent Variable: default          AIC:              3197.1126  
Date:               2022-04-08 16:32 BIC:              3329.2527  
No. Observations:   3000             Log-Likelihood:   -1576.6    
Df Model:           21               LL-Null:          -2079.4    
Df Residuals:       2978             LLR p-value:      1.6505e-199
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     7.0000                                        
---------------------------------------------------------------------
        Coef.     Std.Err.       z       P>|z|      [0.025     0.975]
---------------------------------------------------------------------
A1      0.0934      0.0079    11.7463    0.0000     0.0778     0.1090
A3     -0.0001      0.0000    -2.7201    0.0065    -0.0001    -0.0000
A4     -0.0000      0.0000    -0.1718    0.8636    -0.0000     0.0000
A5      0.0213      0.0062     3.4136    0.0006     0.0091     0.0336
A7      0.1297      0.0505     2.5685    0.0102     0.0307     0.2286
A8     -0.0097      0.0016    -5.9868    0.0000    -0.0128    -0.0065
A9     -0.1514      0.0789    -1.9181    0.0551    -0.3060     0.0033
A13    -0.0066      0.0046    -1.4174    0.1564    -0.0157     0.0025
A14    -0.0048      0.0054    -0.9026    0.3668    -0.0154     0.0057
A15    -0.0000      0.0000    -2.0303    0.0423    -0.0000    -0.0000
A16     0.3404      0.1637     2.0800    0.0375     0.0196     0.6612
A17     0.0000      0.0000     0.3416    0.7327    -0.0001     0.0001
A20    -0.1107      0.0773    -1.4322    0.1521    -0.2621     0.0408
A21    -0.0000      0.0003    -0.0167    0.9867    -0.0006     0.0006
A22     0.1676      0.5571     0.3009    0.7635    -0.9242     1.2594
A23    -1.6862      3.2159    -0.5243    0.6001    -7.9891     4.6168
A25     0.0426      0.4037     0.1054    0.9160    -0.7487     0.8338
A26    -0.0001      0.0002    -0.3078    0.7583    -0.0004     0.0003
A27    -0.0001      0.0001    -1.2409    0.2146    -0.0003     0.0001
A28     0.0006      0.0004     1.3996    0.1616    -0.0002     0.0014
A29     0.0004      0.0010     0.4404    0.6596    -0.0015     0.0024
A30    -0.0002      0.0007    -0.2291    0.8188    -0.0015     0.0011
==================================================================

In [148]:
#dropping variables based on p-values
X_train_OS_2=X_train_US[['A1','A3','A5','A7','A8','A15','A16']]
# building Log Reg model with new vector X for US

logit_model_OS_2=sm.Logit(Y_train_OS,X_train_OS_2)
result=logit_model_OS_2.fit()
print(result.summary2())
Optimization terminated successfully.
         Current function value: 0.527557
         Iterations 6
                          Results: Logit
==================================================================
Model:              Logit            Pseudo R-squared: 0.239      
Dependent Variable: default          AIC:              3179.3435  
Date:               2022-04-08 16:37 BIC:              3221.3880  
No. Observations:   3000             Log-Likelihood:   -1582.7    
Df Model:           6                LL-Null:          -2079.4    
Df Residuals:       2993             LLR p-value:      2.2317e-211
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     6.0000                                        
---------------------------------------------------------------------
        Coef.     Std.Err.       z       P>|z|      [0.025     0.975]
---------------------------------------------------------------------
A1      0.0897      0.0075    11.9334    0.0000     0.0749     0.1044
A3     -0.0001      0.0000    -3.5288    0.0004    -0.0001    -0.0000
A5      0.0104      0.0033     3.1194    0.0018     0.0039     0.0170
A7      0.1021      0.0473     2.1572    0.0310     0.0093     0.1948
A8     -0.0101      0.0015    -6.8005    0.0000    -0.0130    -0.0072
A15    -0.0000      0.0000    -4.4367    0.0000    -0.0000    -0.0000
A16     0.3508      0.1603     2.1887    0.0286     0.0367     0.6649
==================================================================

In [149]:
#All variables are significant
In [164]:
# Log Reg on OS data
logreg_OS= LogisticRegression()
logreg_OS.fit(X_train_OS_2, Y_train_OS)
Out[164]:
LogisticRegression()
In [151]:
#AUC for train_set (Logistic Reg OS data)

logit_roc_auc = roc_auc_score(Y_train_OS, logreg_OS.predict(X_train_OS_2))
fpr, tpr, thresholds = roc_curve(Y_train_OS, logreg_OS.predict_proba(X_train_OS_2)[:,1])
plt.figure(figsize=(20,20))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
In [153]:
#AUC for test_set (Logistic Reg OS data)
X_test_OS=loan_data_test[['A1','A3','A5','A7','A8', 'A15','A16']]
In [155]:
logit_roc_auc = roc_auc_score(Y_test, logreg_OS.predict(X_test_OS))
fpr, tpr, thresholds = roc_curve(Y_test, logreg_OS.predict_proba(X_test_OS)[:,1])
plt.figure(figsize=(20,20))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
In [ ]: